import re
import pandas as pd
import collections
import numpy as np
import plotly.graph_objects as go
import string
Read in a bunch of tweets that were collected during the airing of the 'Red Wedding' episode of Game of Thrones from the file got_tweets.csv.
Parse all the hashtags out of the texts, then count their overall occurrences. What are the 10 most common hashtags? Make a bar chart of these top 10 hashtags! Create a plot of hashtag timelines for the most interesting and frequent hashtags!
# Load the raw tweets collected during the 'Red Wedding' episode.
# NOTE(review): the code below relies on a 'text' and a 'created_at'
# column being present in the CSV — confirm against the data file.
got = pd.read_csv("got_tweets.csv")
got.head()
#Parsing hashtags
flatten = lambda t: [item for sublist in t for item in sublist]
#Make all str lowercase so hashtag variants collapse to a single form
got_lc = got.text.str.lower()
# Remove every punctuation character except '#'.
# regex=False is essential here: characters such as '.', '*' and '+' are
# regex metacharacters, and under pandas versions where regex=True was the
# default, replacing '.' would have wiped the entire text.
for s in string.punctuation.replace("#", ""):
    got_lc = got_lc.str.replace(s, "", regex=False)
# Find all hashtag tokens: '#' followed by everything up to the next
# whitespace (or the end of the string).
HASHTAGS = got_lc.str.findall(r'#.*?(?=\s|$)')
#Counting hashtag occurrences: the 10 most common hashtags overall
pd.Series(flatten(HASHTAGS)).value_counts()[:10]
#Barchart of hashtags (log-scaled y axis, since frequencies span orders of magnitude)
cntr = collections.Counter(flatten(HASHTAGS))
ht, cnt = zip(*cntr.items())
bars = go.Bar(x=ht, y=cnt)
fig = go.Figure(data=[bars])
fig.update_yaxes(type="log")
fig.update_traces(opacity=0.6,
                  marker_line_width=0.5,
                  marker_line_color='rgb(108,48,107)')
fig.update_layout(title="Barchart of hashtags", yaxis_title="word frequency")
fig.show()
#Hashtag timeline
#Linking the list of hashtags to dates
ht_df = pd.DataFrame([HASHTAGS, got.created_at]).T
timeline_list = []
# Pair every hashtag with the timestamp of its tweet.
# Iterating the two columns directly with zip() produces exactly the same
# rows as the original index loop, but avoids the repeated O(n) .loc
# lookups that were performed for every single row.
for tags, created in zip(ht_df.text, ht_df.created_at):
    for word in tags:
        timeline_list.append({"text": word, "created_at": created})
timeline_df = pd.DataFrame(timeline_list, columns=["text", "created_at"])
timeline_df
#The number of distinct hashtags:
len(ht)
#It would be wasteful to try to plot all 2570 hashtags' timelines.
# Instead of plotting every hashtag's timeline, keep only those occurring more than 50 times.
frequent = np.array(cnt) > 50
sum(frequent)
relevant_ht = np.array(ht)[frequent]  # the hashtags deemed relevant
print(relevant_ht)
#Filter the df by the relevant hashtags and parse the timestamps
timeline_df = timeline_df[timeline_df.text.isin(relevant_ht)]
timeline_df["created_at"] = pd.to_datetime(timeline_df["created_at"])
timeline_df
timeline_df
# Count occurrences per (hashtag, timestamp) pair.
timeline_gb = timeline_df.groupby(["text", "created_at"]).size()
timeline_gb  # a multi-indexed Series: index levels are (hashtag, datetime), values are counts
fig = go.Figure()
# BUG FIX: iterate only the *relevant* hashtags. The original looped over
# every distinct hashtag in `ht`, but `timeline_gb` only contains the
# frequent ones, so thousands of iterations added empty traces (and empty
# legend entries) to the figure.
for word in relevant_ht:
    # Select this hashtag's (timestamp -> count) slice of the grouped Series.
    subdf = timeline_gb[timeline_gb.index.get_level_values('text') == word]
    occ_by_time = subdf.values
    t = subdf.index.get_level_values(1)
    # groupby sorts its keys, so the timestamps are already chronological
    # and the trace does not come out scrambled.
    fig.add_trace(go.Scatter(x=t, y=occ_by_time, name=word))
fig.update_layout(title="Hashtag timeline",
                  yaxis_title="word occurrence",)
fig.show()
On the graph above, we can see that the most frequent tweet was (obviously) #gameofthrones. By turning it off on the plot, there is a much more meaningful graph.
Firstly, I'd like to point out that there are huge gaps between 19:00-19:37 and 21:08-21:30.
Secondly, after 21:50 the number of hashtags is increasing. It might suggest that there was an important part in the episode which led many people to share their experience about it, or it might mean that the show ended at that time. The latter is more probable, because the #getglue hashtag also increased during that time; #getglue was a social networking website where users could check in that they watched a movie.
Tokenize the text of the tweets, and gather the 'real' words for each tweet.
By 'real' words, there should be no URLs, no @-mentions, no emojis, and no # marks.
Count word occurrences, make a histogram of the occurrences. What are the top words? Are they what you expected?
What crazy words did you get? Explain possible approaches, with which you could throw out this kind of junk text as well.
got2 = got.copy(deep = True) #making a deep copy, so it works independently of `got`
#Remove URLs ('http' up to the next whitespace)
print("Before:\n",got2.text[5])
print("Num of http:", got2.text.map(lambda x: "http" in x).sum())
got2["text"] = got2["text"].map( lambda x: re.sub(r'http\S+', '', x) )
print("\nAfter:\n",got2.text[5])
print("Num of http:",got2.text.map(lambda x: "http" in x).sum())
print("Num of https:",got2.text.map(lambda x: "https" in x).sum())
#Remove #s (regex=False: plain literal replacement, independent of pandas defaults)
print("Before:\n",got2.text[0])
got2["text"] = got2.text.str.replace("#", "", regex=False)
print("After:\n",got2.text[0])
#Remove @UserMentions
print("Before:\n",got2.text[5])
# raw string for the pattern: "@\S+" without r'' is an invalid escape
# sequence and raises a DeprecationWarning on modern Python
got2["text"] = got2.text.map( lambda x: re.sub(r"@\S+", "", x) )
print("\nAfter:\n",got2.text[5])
#Remove emojis, adapted from https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
# The pattern is compiled once at module level — the original recompiled it
# on every call (once per tweet) — and the 'regrex' typo is gone.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

def deEmojify(text):
    """Return *text* with characters in the common emoji code-point ranges removed."""
    return _EMOJI_PATTERN.sub('', text)
got2["text"] = got2.text.map(deEmojify)
#remove punctuations
# regex=False: '.', '*', '+' etc. are regex metacharacters and must be
# treated as literal characters here (under old pandas defaults,
# replacing '.' would have deleted the entire text).
for s in string.punctuation.replace("#", ""):
    got2["text"] = got2.text.str.replace(s, "", regex=False)
#Remove numbers
print("Before: ",got2.text.map(lambda x: any( d in x for d in string.digits) ).sum())
for d in string.digits:
    got2["text"] = got2.text.str.replace(d, "", regex=False)
print("After: ",got2.text.map(lambda x: any( d in x for d in string.digits) ).sum())
#make lowercase
got2["text"] = got2.text.str.lower()
# All words from all tweets, with empty strings dropped (artifacts of
# consecutive spaces left behind by the removals above).
TWEET_WORDS = flatten(got2.text.str.split(" ").to_list())
TWEET_WORDS = list(filter(lambda a: a != "", TWEET_WORDS)) # remove "" character
# histogram of words (log-scaled y axis)
word_counter = collections.Counter(TWEET_WORDS)
words, freqs = zip(*word_counter.most_common())
fig = go.Figure([go.Bar(x=words, y=freqs)])
fig.update_yaxes(type="log")
fig.update_traces(opacity=0.6,
                  marker_line_width=0.5,
                  marker_line_color='rgb(108,48,107)')
fig.update_layout(title="Barchart of tweeted words", yaxis_title="word frequency")
fig.show()
word_counter.most_common(30)  # the 30 most common words in the tweets
As can be seen above, the most common words in these tweets were either closely related to the Game of Thrones show (its name or the name of the episode) or were stopwords.
Extract the stopword list for the English language with the help of nltk. Download the standard Brown Corpus also from nltk, count the relative frequency of stopwords in both the Brown Corpus and the GoT tweets. Make a scatterplot of your results, try to explain possible similarities and deviations. What is the correlation in the stopword frequencies of the two datasets?
import nltk
#nltk.download("stopwords")  # one-time download; uncomment on first run
from nltk.corpus import stopwords
# English stopword list. NOTE: this is a plain Python list, and it is
# extended with show-specific words later in the file, so it must stay a list.
en_stop = stopwords.words('english') # English stopwords
print(en_stop)
#nltk.download("brown")  # one-time download; uncomment on first run
from nltk.corpus import brown
len(brown.words())
#Filtering the corpora, so that counting the stopwords is faster
def filterStopword(word):
    """Return True when *word* is in the global `en_stop` stopword list.

    The original if/else returning True/False is collapsed into returning
    the boolean expression directly.
    """
    return word in en_stop
filtered_got = list(filter(filterStopword, TWEET_WORDS))
filtered_brown = list(filter(filterStopword, brown.words()))
#Comparison of frequencies of stopwords
cntr = collections.Counter( filtered_got )
ht_got, cnt_got = zip( *cntr.most_common())
cntr = collections.Counter( filtered_brown )
ht_brown, cnt_brown = zip( *cntr.most_common())
fig = go.Figure()
# FIX: the original wrapped one go.Bar inside another — go.Bar(go.Bar(...)).
# Plotly's trace constructors accept an existing trace as the first
# positional argument and copy its properties, so it happened to work,
# but a single constructor call is what was meant.
fig.add_trace(go.Bar(x=ht_got, y=cnt_got, name="GoT corpus"))
fig.add_trace(go.Bar(x=ht_brown, y=cnt_brown, name="Brown corpus"))
fig.update_yaxes(type="log")
fig.update_layout(title="Comparison of frequencies of stopwords",
                  yaxis_title="word frequency",)
It is quite apparent that both corpora have a very similar distribution of stopwords, because of the nature of the English language. Other than that, I claim there are two major differences between them.
Firstly, the GoT corpus has a lot more of the abbreviated stopword fragments (such as I'm instead of I am, they'd instead of they had), which were left behind after removing punctuation. The Brown corpus, on the other hand, still contains these stopwords in their full form.
A really common tool to visualize texts is a wordcloud. Find a suitable library and create a meaningful wordcloud of the GoT tweets (e.g. leave out punctuation, stopwords etc.)
#!pip install wordcloud
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
#filtering out stopwords
def filterOutStopword(word):
    """Return True when *word* should be KEPT, i.e. it is not in `en_stop`.

    Inverse of `filterStopword`; the if/else returning False/True is
    collapsed into returning the boolean expression directly.
    """
    return word not in en_stop
# Tweet words with English stopwords removed.
got_wo_stopwords = [w for w in TWEET_WORDS if filterOutStopword(w)]
plt.figure(figsize=[12, 6])
# First cloud: capped font sizes and word count, white background.
wordcloud = WordCloud(max_font_size=50, min_font_size=6, max_words=100,
                      background_color="white")
wordcloud.generate(" ".join(got_wo_stopwords))
plt.imshow(wordcloud, interpolation="spline36")
# The show-specific words ("game", "thrones", ...) dominate the cloud while
# carrying no extra meaning, so they are added to the wordcloud stopword set.
specific_sws = ["throne","thrones", "game", "gameofthrone", "gameofthrones", "got"]
STOPWORDS.update(specific_sws)
plt.figure(figsize=[12, 6])
wordcloud = WordCloud(max_font_size=50, min_font_size=6, max_words=100,
                      background_color="white", stopwords=STOPWORDS)
wordcloud.generate(" ".join(got_wo_stopwords))
plt.imshow(wordcloud, interpolation="lanczos")
# Load the silhouette masks, keeping only channel 3 of each image.
# NOTE(review): presumably the PNGs are RGBA and channel 3 is the alpha
# channel carrying the monochrome silhouette — confirm against the files.
mask = np.array(Image.open("got-head.png"))[:,:,3] #omitting the 3 additional channels - it is a monochrome picture
mask2 = np.array(Image.open("throne.png"))[:,:,3]
# Invert pixel values so 0 becomes 255 and vice versa.
def transform_format(val):
    """Return the 8-bit inversion of *val* (0 -> 255, 255 -> 0)."""
    inverted = 255 - val
    return inverted
# Invert the masks so the silhouette becomes the drawable region.
# One vectorized NumPy expression replaces the original per-row Python
# map() loops, which were very slow for large images; the result is the
# same int16 array of 255-minus-pixel values.
tf_mask = 255 - mask.astype(np.int16)
tf_mask2 = 255 - mask2.astype(np.int16)
plt.imshow(tf_mask, cmap = "gray")
plt.figure(figsize=[16,14])
# Two masked clouds side by side: a single-color one on the head silhouette,
# and a default-colored one on the throne silhouette.
wc = WordCloud(max_font_size=50, min_font_size=6, max_words=100, background_color="white",
               color_func=lambda *args, **kwargs: "darkorange",
               mask = tf_mask, contour_width=3, contour_color='saddlebrown').generate(" ".join(got_wo_stopwords))
wc2 = WordCloud(max_font_size=50, min_font_size=6, max_words=100, background_color="white",
                mask = tf_mask2, contour_width=1, contour_color='black').generate(" ".join(got_wo_stopwords))
plt.subplot(121)
plt.imshow(wc, interpolation = "lanczos")
plt.axis("off")
plt.subplot(122)
plt.imshow(wc2, interpolation = "lanczos")
plt.axis("off")
#The words are much more readable when written with different colors
#I used this image to generate the last wordcloud
rw_cutout = np.array(Image.open("red_wedding_sc_cut2.png"))
plt.imshow(rw_cutout)
#And that's even better :D
wc3 = WordCloud(background_color='white', mask=rw_cutout,
                stopwords=STOPWORDS, min_font_size=1)
wc3.generate(" ".join(got_wo_stopwords))
# Recolor the cloud so each word takes the color of the image region it covers.
wc3.recolor(color_func=ImageColorGenerator(rw_cutout))
plt.figure(figsize=[14, 10])
plt.imshow(wc3, interpolation="lanczos")
plt.axis('off')
I followed this guide to generate these figures.
We could draw a general conclusion with the help of this visualization, that the viewers were quite shocked by the twists this episode caused in the plot of the Game of Thrones. This is supported by the high frequency of the strong words used in the tweets (swear words, kill, hate, speechless, shocked etc.).
Define a time window in which all tweets count as one document. Create the term-document matrix of the tweets for this time segmentation. Apply stemming and stopword filtering.
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import SnowballStemmer

c = CountVectorizer()
#Filtering stopwords: from here on, the show-specific words count as stopwords too
en_stop.extend(specific_sws)
got2["text"] = got2.text.map(lambda x: " ".join(filter(filterOutStopword, x.split(" "))))
#import nltk
#nltk.download('punkt')
got2.text
#Stemming tweets with the Snowball stemmer
snow = SnowballStemmer('english', ignore_stopwords=True)
got2.text = got2.text.map(lambda line: " ".join(snow.stem(word) for word in line.split(" ")))
#I considered one document to be 1 minute in length, so I aggregated the tweets by the minute.
# groupby('created_at').transform(' '.join) concatenates all tweets sharing a
# timestamp into one string (repeated on every member row); drop_duplicates
# then keeps one row per timestamp, i.e. one "document".
# NOTE(review): this assumes 'created_at' already has minute granularity in
# the CSV — nothing here truncates the timestamps; confirm against the data.
timeline2_df = pd.DataFrame(got2[["created_at","text"]].groupby("created_at")["text"].transform(" ".join).drop_duplicates())
# Re-attach the timestamp of each kept row, then reset to a clean 0..n-1 index.
timeline2_df["created_at"] = got2.loc[timeline2_df.index].created_at
timeline2_df = timeline2_df.set_index(np.arange(len(timeline2_df)))
timeline2_df
# Term-document matrix: rows = documents (minutes), columns = vocabulary terms.
term_doc = pd.DataFrame(c.fit_transform(timeline2_df.text).toarray())
plt.figure(figsize=[25,160])
plt.imshow(term_doc) #It looks like, there's nothing to be seen, x axis is the document, y axis is the word
Apply a TF-IDF weighting scheme for the term-document matrix by hand (e.g. do not use a built-in vectorizer, but normalize by text length with a summation etc. numpy or pandas is strongly suggested). Then, choose a topic detection method such as LSI or LDA, and run it on your matrix. Try to interpret your results! Are your topics meaningful? Which topics are the most representative of your document?
#Calculating term frequency, showing it's normalized:
# occurrences of the word in the document / number of words in the document
term_frequency = term_doc.divide(term_doc.sum(axis="columns"), axis="rows")
term_frequency.sum(axis="columns")  # every row sums to 1 -> properly normalized
# idf: log(number of documents / number of documents containing the word).
# (term_doc > 0).sum(axis="rows") counts, per column, the documents with a
# non-zero term count — one vectorized pass instead of the original Python
# loop over every vocabulary column.
doc_freq = (term_doc > 0).sum(axis="rows").to_numpy()
idf = np.log(len(term_doc) / doc_freq)
tf_idf = term_frequency * idf
#LSA method: making SVD with the tf_idf, then taking the first n most significant vectors to score the documents
from sklearn.decomposition import TruncatedSVD
n_comp = 10
svd = TruncatedSVD(n_components = n_comp)
lsa = svd.fit_transform(tf_idf)
# Document-topic scores, indexed by document timestamp.
topic_encoded_df = pd.DataFrame(lsa, index = timeline2_df.created_at,
                                columns = [f"topic{i}" for i in range(n_comp)])
topic_encoded_df["text"] = timeline2_df.text
# Topic-term matrix. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
encoding_matrix = pd.DataFrame(svd.components_,
                               index = [f"topic{i}" for i in range(n_comp)],
                               columns = c.get_feature_names_out())
#Finding the most relevant words for each topic
top_by_topic = pd.DataFrame()
for topic in encoding_matrix.index:
    # Rank the vocabulary by the magnitude of its loading on this topic
    # and keep the 15 strongest words (positive or negative).
    ranked = (encoding_matrix.T
              .sort_values(topic, axis="rows", ascending=False, key=np.abs)[topic][:15]
              .reset_index())
    ranked.columns = [topic, topic + "_value"]
    top_by_topic = pd.concat([top_by_topic, ranked], axis=1)
top_by_topic
If we want to interpret the encoding matrix, we can list the words along with their absolute values. This value describes how strongly a word characterizes a certain topic, either in a negative or a positive way. This is very much like a correlation coefficient.
First of all, I experimented with 5-15 topics, but later decided to stick to 10. I've listed the 15 most significant words for each topic. Unfortunately, I couldn't find a way to describe the topics in a meaningful way.
The best words for Topic 5 might be interesting, because the first 2 words are river and drown.
Some of the words seem to be gibberish or from another language. So, some topics might represent documents from a given language. But this approach is debatable, since one document contains multiple tweets, which might be written in different languages.
#Let's compare it with the sklearn library's tf-idf implementation
from sklearn.feature_extraction.text import TfidfVectorizer
n_comp = 10
svd2 = TruncatedSVD(n_components = n_comp)
# norm="l1" matches the hand-rolled version above, where each document's
# term frequencies sum to 1.
tf_idf_sk = TfidfVectorizer(norm="l1")
X = tf_idf_sk.fit_transform(timeline2_df.text)
lsa2 = svd2.fit_transform(X)
topic_encoded_df = pd.DataFrame(lsa2, columns = [f"topic{i}" for i in range(n_comp)])
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
encoding_matrix2 = pd.DataFrame(svd2.components_,
                                index = [f"topic{i}" for i in range(n_comp)],
                                columns = tf_idf_sk.get_feature_names_out())
# Same top-15-words-per-topic listing as for the hand-rolled matrix.
top_by_topic2 = pd.DataFrame()
for col in encoding_matrix2.index:
    topic_temp_df2 = pd.DataFrame(encoding_matrix2.T.sort_values(col, axis = "rows", ascending = False,
                                                                 key = np.abs)[col][:15]).reset_index()
    topic_temp_df2.columns = [col, col+"_value"]
    top_by_topic2 = pd.concat([top_by_topic2, topic_temp_df2], axis = 1)
top_by_topic2
By using the scikit-learn library function we obtain a similar result: the grouping of the top words has a lot in common. Other than that, just like in our solution from scratch, it doesn't seem to make much sense to divide the corpus into topics with the use of these words.
Write an own name parser for the tweets, and consider all names that you find in the dataset as a node of a graph. Add 1 to the weight of an edge if two names occur in the same tweet. With the help of networkx, draw the weighted network of names from the text. Try to find a simple clustering algorithm in networkx, cluster the names in the dataset. Print or visualize your results!
This episode caused severe disappointments in many viewers, because of the sudden death of too many of the favourite characters. Search for some sentiment analysis method, and create a timeline of sentiments based on the tweet texts. Do the sentiments on Twitter reflect the time of the worst scene?